# ========================================================================= #
# Project: Lexical Ambiguity in Political Rhetoric (BJPolS)
# - Script: Load raw data, preprocess text, export for analysis
# - Author: Patrick Kraft (patrickwilli.kraft@uc3m.es)
# ========================================================================= #


# Load packages and custom functions --------------------------------------

source(here::here("code/00-func.R"))


# Load dictionary ---------------------------------------------------------

## Dictionary file in LIWC format (presumably the Moral Foundations
## Dictionary 2.0, judging by the file name -- confirm)
dict <- dictionary(
  file = here("in/mfd2.0.dic"),
  format = "LIWC"
)


# Load speeches, convert to common tibble format --------------------------

## US State of the Union addresses (shipped with quanteda.corpora), keeping
## speeches dated after 1949-12-31 and before 2020-12-31
sotu <- local({
  # intermediate objects stay inside this block; only the tibble is returned
  sotu_corp <- corpus_subset(
    quanteda.corpora::data_corpus_sotu,
    Date > as.Date("1949-12-31") & Date < as.Date("2020-12-31")
  )
  sotu_meta <- docvars(sotu_corp)

  tibble(
    type = "a) US State of the Union",
    doc_id = docnames(sotu_corp),
    year = as.numeric(format(sotu_meta$Date, "%Y")),
    # any party label other than these two levels becomes NA
    party = factor(sotu_meta$party, levels = c("Democratic", "Republican")),
    speaker = sotu_meta$President,
    text = enc2utf8(as.character(sotu_corp))
  )
})

## UK Queen's speeches: one text file per speech; qs_info.csv is expected to
## supply the party and speaker columns used below
qs <- here("in/UK_QueensSpeech/QUE*.txt") %>%
  readtext() %>%
  as_tibble() %>%
  mutate(
    type = "b) UK Queen's Speeches",
    text = enc2utf8(text),
    # the digits contained in the file name are taken as the year
    year = as.numeric(gsub("\\D+","", doc_id))
  ) %>%
  # no `by` is given, so the join uses whatever columns both tables share
  left_join(read_csv(here("in/UK_QueensSpeech/qs_info.csv"))) %>%
  mutate(
    party = recode_factor(party, LAB = "Labour", CON = "Conservative")
  ) %>%
  select(type, doc_id, year, party, speaker, text)
  
## US party convention speeches; file names are split on "_" into
## <party>_<year>_<speaker>.txt
co <- readtext(here("in/US_PartyConvention/*.txt")) %>%
  as_tibble() %>%
  separate(doc_id,
           into = c("party", "year", "speaker"),
           sep = "_",
           remove = FALSE) %>%      # keep the original doc_id column
  mutate(type = "c) US Convention Speeches",
         text = enc2utf8(text),
         # strip only a literal, trailing ".txt": the dot is escaped and the
         # pattern anchored so it cannot match inside the speaker name
         speaker = str_remove(speaker, "\\.txt$"),
         party = recode_factor(party,
                               "D" = "Democratic",
                               "R" = "Republican"),
         year = as.numeric(year)) %>%
  select(type, doc_id, year, party, speaker, text) %>%
  arrange(year)

## UK party leader speeches; party code and year are read from fixed
## positions of the file name (characters 1-3 and 5-8)
pl <- here("in/UK_PartyLeader/*.txt") %>%
  readtext() %>%
  as_tibble() %>%
  mutate(
    type = "d) UK Party Leader Speeches",
    text = enc2utf8(text),
    party = substr(doc_id, 1, 3) %>%
      recode_factor(LAB = "Labour", CON = "Conservative", LIB = "Liberal"),
    year = substr(doc_id, 5, 8) %>% as.numeric(),
    speaker = NA    # no speaker is recorded for this source
  ) %>%
  select(type, doc_id, year, party, speaker, text) %>%
  arrange(year)

## US presidential & vice-presidential debates
## Paths go through here() like every other input in this script, so the
## section no longer depends on the current working directory.
## deb_info is used further down to look up candidate speaker labels by year.
deb_info <- read_csv(here("in/US_PresidentialDebates/deb_info.csv"))
deb <- readtext(here("in/US_PresidentialDebates/us*.txt")) %>%
  mutate(text = gsub("\nM(R|r|S|s)\\. ", "\n", text),             # remove MR./MS. at beginning of para
         text = gsub("^M(R|r|S|s)\\. ", "", text),                # remove MR./MS. at beginning of string
         text = gsub("(\\(|\\[)\\w+(\\)|\\])", " ", text),        # remove applause and other one-word bracketed comments
         year = gsub("(us|(pr\\w+.txt|vp\\w+.txt))", "", doc_id), # add year variable (kept as character here)
         pres = grepl("pres", doc_id),                            # presidential (vs. vp) debate
         rep = NA_character_,                                     # statements by Republican candidate
         dem = NA_character_                                      # statements by Democratic candidate
  ) %>% as_tibble()

## extract and combine candidate statements
## For each debate transcript, collect everything said by the Republican and
## by the Democratic candidate(s) into deb$rep / deb$dem. Speakers that are
## not listed in deb_info for that year (e.g. moderators) are dropped.
for (i in seq_len(nrow(deb))) {   # seq_len() is safe when deb has no rows
  # candidate labels for this debate's year, looked up once per debate
  deb_cand <- filter(deb_info, year == deb$year[i])

  # split the transcript into statements ("\n\n"), then each statement into
  # speaker (column 1) and text (column 2) at the first ": "
  deb_stmt <- str_split_fixed(str_split(deb$text[i], "\n\n")[[1]], ": ", 2)
  stmt_is_dem <- deb_stmt[, 1] %in% deb_cand$dem
  stmt_is_rep <- !stmt_is_dem & deb_stmt[, 1] %in% deb_cand$rep

  # assign by column name rather than by position, so a debate where one
  # side has no matched statements gets NA instead of the other side's text
  deb$rep[i] <- if (any(stmt_is_rep)) {
    paste(deb_stmt[stmt_is_rep, 2], collapse = " ")
  } else {
    NA_character_
  }
  deb$dem[i] <- if (any(stmt_is_dem)) {
    paste(deb_stmt[stmt_is_dem, 2], collapse = " ")
  } else {
    NA_character_
  }
}

## reshape tibble to common format: one row per debate and party, holding
## that party's combined statements as the document text
deb <- deb %>%
  select(-text) %>%
  pivot_longer(cols = c(rep, dem), names_to = "party", values_to = "text") %>%
  mutate(
    type = "e) US Presidential Debates",
    year = as.numeric(year),
    text = enc2utf8(text),
    # `speaker` records the office debated, not an individual's name
    speaker = ifelse(pres, "President", "Vice President"),
    # build doc_id while party still holds the raw "rep"/"dem" label
    doc_id = paste0(party, "_", doc_id),
    party = recode_factor(party, dem = "Democratic", rep = "Republican")
  ) %>%
  select(type, doc_id, year, party, speaker, text)

## US Senate emails: seeded random sample of (up to) 1000 emails from 2020,
## excluding senators whose Party is "Independent"
set.seed(42)
email <- here("in/US_Emails/dcinbox_export_senate2010-2020.csv") %>%
  read_csv() %>%
  filter(Party != "Independent") %>%
  transmute(
    type = "f) US Senate Emails",
    doc_id = as.character(ID),
    # timestamp is divided by 1000 before conversion (presumably stored in
    # milliseconds). NOTE(review): no tz is given, so the year is formatted
    # in the session's time zone -- confirm this is intended.
    year = as.numeric(
      format(as.POSIXct(`Unix Timestamp`/1000, origin="1970-01-01"),
             format = "%Y")
    ),
    party = recode_factor(Party,
                          Democrat = "Democratic",
                          Republican = "Republican"),
    speaker = paste0(`Last Name`, ", ", `First Name`),
    text = enc2utf8(Body)
  ) %>%
  filter(!is.na(text)) %>%
  filter(year == 2020) %>%
  slice_sample(n = 1000)


# Preprocess sentences ----------------------------------------------------

## Remove special characters etc., then split every document into sentences
## (one row per sentence, numbered consecutively in sentence_id)
df <- bind_rows(sotu, qs, co, pl, deb, email) %>%
  mutate(
    text = enc2utf8(text),
    # Drop the period after common abbreviations so it is not taken as a
    # sentence boundary. The dot is escaped and the abbreviation anchored at
    # a word boundary: an unescaped "." matches any character, which would
    # turn e.g. "State" into "Stte" and "Mrs." into "Mr.".
    text = gsub("\\b(Mr|Ms|Mrs|St|Dr)\\.", "\\1", text),
    text = gsub("(\\(|\\[)(A|a)pplause\\S*(\\)|\\])\\S*"," ", text), # remove applause markers
    text = gsub("§\\s+\"*", " ", text),
    text = gsub("…", " ", text),
    text = gsub("<\\w\\w>", " ", text),            # remove special characters encoded as e.g. <92>
    text = gsub("�"," ", text),                    # remove special characters encoded as �
    text = gsub("(\\[\\]|\\(\\))", " ", text),     # replace empty parentheses
    text = gsub("\n(-+|•)", " ", text),            # remove bullet points
    text = gsub("\\<\\d+\\>", " ", text),          # remove standalone numbers
    text = gsub("(:|;)(\\s+|\n+)", ". ", text),    # replace colon & semicolon with period
    text = gsub(", \n*([A-HJ-Z])", ". \\1", text), # replace instances where comma was used as period
    text = gsub("\\.(\\s+|\\s*\n+)([a-z])", ". \\U\\2", text, perl = TRUE) # upper case after period
  ) %>%
  unnest_tokens(sentence, text, token = "sentences") %>%
  mutate(sentence_id = row_number())

## Only keep sentences with >5 words
## Words are counted per sentence and the counts joined back onto the
## sentence table. The join key is stated explicitly (sentence_id is the only
## column the two tables share). Sentences without any word token get
## n = NA and are removed by the filter as well.
df <- df %>%
  unnest_tokens(word, sentence, token = "words") %>%
  count(sentence_id) %>%
  right_join(df, by = "sentence_id") %>%
  filter(n > 5)


# Export for analysis -----------------------------------------------------

## Dictionary and document-level tibbles go into a single .Rdata file
save(list = c("dict", "sotu", "qs", "co", "pl", "deb", "email"),
     file = here("out/speeches.Rdata"))

## Sentence-level data: full table as csv, plus the bare sentences
## (one per line) as plain text
write_csv(df, here("out/sentences.csv"))
write_lines(df$sentence, here("out/sentences.txt"))
